sc_trim_barcodeassume read1 contains the transcript
N in its barcode or UMI:
FALSEsc_exon_mappingsc_demultiplexsc_gene_countingThe organism is “hsapiens_gene_ensembl”, and gene id type is “ensembl_gene_id”.
if (is.null(params$organism) || is.na(params$organism)) {
sce = create_sce_by_dir(params$outdir)
} else {
sce = create_sce_by_dir(params$outdir, organism=params$organism, gene_id_type=params$gene_id_type)
}
overall_stat = demultiplex_info(sce)
datatable(overall_stat, width=800)
Plot barcode match statistics in pie chart:
plot_demultiplex(sce)
ggplotly(plot_mapping(sce, dataname=params$samplename, percentage = FALSE))
ggplotly(plot_mapping(sce, dataname=params$samplename, percentage = TRUE))
if (any(colSums(counts(sce)) == 0)) {
zero_cells = sum(colSums(counts(sce)) == 0)
sce = sce[, colSums(counts(sce)) > 0]
} else {
zero_cells = 0
}
Datatable of all QC metrics:
sce = calculate_QC_metrics(sce)
no ERCC spike-in. Skip `non_ERCC_percent`
if(!all(colSums(as.data.frame(QC_metrics(sce)))>0)){
QC_metrics(sce) = QC_metrics(sce)[, colSums(as.data.frame(QC_metrics(sce)))>0]
}
datatable(as.data.frame(QC_metrics(sce)), width=800, options=list(scrollX= TRUE))
Summary of all QC metrics:
datatable(do.call(cbind, lapply(QC_metrics(sce), summary)), width=800, options=list(scrollX= TRUE))
Number of reads mapped to exon before UMI deduplication VS number of genes detected:
ggplotly(ggplot(as.data.frame(QC_metrics(sce)), aes(x=mapped_to_exon, y=number_of_genes))+geom_point(alpha=0.8))
A robustified Mahalanobis Distance is calculated for each cell then
outliers are detected based on the distance. However, due to the complex
nature of single cell transcriptomes and protocol used, such a method
can only be used to assist the quality control process. Visual
inspection of the quality control metrics is still required. By default
we use comp = 2 and the algorithm will try to separate the
quality control metrics into two gaussian clusters.
The number of outliers:
sce_qc = detect_outlier(sce, type="low", comp = 2)
the following QC metrics not found in colData from sce: 'non_mt_percent' and 'non_ERCC_percent'
table(QC_metrics(sce_qc)$outliers)
FALSE TRUE
220 156
Pairwise plot for QC metrics, colored by outliers:
plot_QC_pairs(sce_qc)
Remove low quality cells and plot highest expression genes.
sce_qc = remove_outliers(sce_qc)
sce_qc = convert_geneid(sce_qc, returns="external_gene_name")
Number of NA in new gene id: 5724. Duplicated id: -0.5
sce_qc <- calculate_QC_metrics(sce_qc)
no ERCC spike-in. Skip `non_ERCC_percent`
plotHighestExprs(sce_qc, n=20)
Plot the average count for each genes:
ave.counts <- rowMeans(counts(sce_qc))
hist(log10(ave.counts), breaks=100, main="", col="grey80",
xlab=expression(Log[10]~"average count"))
As a loose filter we keep genes that are expressed in at least two cells and for cells that express that gene, the average count larger than two. However this is not the gold standard and the filter may variy depending on the data.
keep1 = (apply(counts(sce_qc), 1, function(x) mean(x[x>0])) > 1.1) # average count larger than 1.1
keep2 = (rowSums(counts(sce_qc)>0) > 5) # expressed in at least 5 cells
sce_qc = sce_qc[(keep1 & keep2), ]
dim(sce_qc)
[1] 5587 220
We got 5587 genes left after removing low abundant genes.
scran and scaterCompute the normalization size factor
ncells = ncol(sce_qc)
if (ncells > 200) {
sce_qc <- computeSumFactors(sce_qc)
} else {
sce_qc <- computeSumFactors(sce_qc, sizes=as.integer(c(ncells/7, ncells/6, ncells/5, ncells/4, ncells/3)))
}
summary(sizeFactors(sce_qc))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.1683 0.4156 0.7546 1.0000 1.2525 5.9488
if (min(sizeFactors(sce_qc)) <= 0) {
sce_qc = sce_qc[, sizeFactors(sce_qc)>0]
}
PCA plot using gene expressions as input, colored by the number of genes.
cpm(sce_qc) = calculateCPM(sce_qc, size.factors=NULL)
sce_qc <- runPCA(sce_qc, exprs_values = "cpm")
plotPCA(sce_qc, colour_by="number_of_genes")
The highly variable genes are chosen based on trendVar
from scran with FDR > 0.05 and biological
variation larger than 0.5. If the number of highly variable
genes is smaller than 100 we will select the top 100 genes by biological
variation. If the number is larger than 500 we will only keep top 500
genes by biological variation.
sce_qc <- logNormCounts(sce_qc)
var.out <- modelGeneVar(sce_qc)
if (length(which(var.out$FDR <= 0.05 & var.out$bio >= 0.5)) < 500){
hvg.out <- var.out[order(var.out$bio, decreasing=TRUE)[1:500], ]
}else if(length(which(var.out$FDR <= 0.05 & var.out$bio >= 0.5)) > 1000){
hvg.out <- var.out[order(var.out$bio, decreasing=TRUE)[1:1000], ]
}else{
hvg.out <- var.out[which(var.out$FDR <= 0.05 & var.out$bio >= 0.5), ]
hvg.out <- hvg.out[order(hvg.out$bio, decreasing=TRUE), ]
}
plot(var.out$mean, var.out$total, pch=16, cex=0.6, xlab="Mean log-expression",
ylab="Variance of log-expression")
o <- order(var.out$mean)
lines(var.out$mean[o], var.out$tech[o], col="dodgerblue", lwd=2)
points(var.out$mean[rownames(var.out) %in% rownames(hvg.out)], var.out$total[rownames(var.out) %in% rownames(hvg.out)], col="red", pch=16)
gene_exp = exprs(sce_qc)
gene_exp = gene_exp[rownames(hvg.out)[1:100], ]
hc.rows <- hclust(dist(gene_exp))
hc.cols <- hclust(dist(t(gene_exp)))
gene_exp = gene_exp[hc.rows$order, hc.cols$order]
m = list(
l = 100,
r = 40,
b = 10,
t = 10,
pad = 0
)
plot_ly(
x = colnames(gene_exp), y = rownames(gene_exp),
z = gene_exp, type = "heatmap")%>%
layout(autosize = F, margin = m)
sce_qc <- runPCA(sce_qc, exprs_values = "logcounts")
plotPCA(sce_qc, colour_by="number_of_genes")
set.seed(100)
if (any(duplicated(t(logcounts(sce_qc)[rownames(hvg.out), ])))) {
sce_qc = sce_qc[, !duplicated(t(logcounts(sce_qc)[rownames(hvg.out), ]))]
}
sce_qc <- runTSNE(sce_qc, exprs_values="logcounts", perplexity=10,feature_set=rownames(hvg.out))
out5 <- plotTSNE(sce_qc, colour_by="number_of_genes") + ggtitle("Perplexity = 10")
sce_qc <- runTSNE(sce_qc, exprs_values="logcounts", perplexity=20,feature_set=rownames(hvg.out))
out10 <- plotTSNE(sce_qc, colour_by="number_of_genes") + ggtitle("Perplexity = 20")
sce_qc <- runTSNE(sce_qc, exprs_values="logcounts", perplexity=30,feature_set=rownames(hvg.out))
out20 <- plotTSNE(sce_qc, colour_by="number_of_genes") + ggtitle("Perplexity = 30")
gridExtra::grid.arrange(out5, out10, out20, ncol = 3)